WIP: New function propop::propop_tables()

Run projections with tibbles, purrr and dplyr

Author

Statistik Aargau (2025), Norah Efosa

Published

November 1, 2025

Introduction

propop::propop_tables() is a dplyr-based version of propop::propop(): it runs the projection on tables (tibbles), whereas propop::propop() uses matrices.

✅ modular structure

✅ uses dplyr

✅ more flexibility for adding/removing columns and new modules

🟨 for now, both functions will be kept and maintained -> to be discussed

Overview: propop_tables()

Function arguments

The new function’s arguments are identical to propop::propop():

(only the first four arguments in blue are mandatory)

# Signature overview: only the first four arguments (parameters, population,
# year_first, year_last) are mandatory; all other arguments have the default
# values shown here.
propop::propop_tables(
  parameters,
  population,
  year_first,
  year_last,
  scenarios = NULL,
  age_groups = 101,
  fert_first = 16,
  fert_last = 50,
  share_born_female = 100 / 205,
  subregional = NULL, # = "net" for net migration; = "rate" for emi- and immigration rates
  binational = TRUE,
  spatial_unit = "spatial_unit"
)

Structure

Excerpt from propop_tables()

# [...]
# Run projection ----
# iterate across spatial units, scenarios and years
# purrr::reduce() folds over `list_parameters`: starting from
# `init_population`, each step hands the result accumulated so far
# (`population`) and the next list element (`parameters`) to
# project_population(); the final accumulated value is stored in `df_result`.
df_result <- purrr::reduce(
  .x = list_parameters,
  .f = \(population, parameters) project_population(
    population, parameters,
    # forward the caller's `subregional` setting unchanged to every step
    subregional = subregional
  ),
  .init = init_population
)
# [...]

Performance for one region (Canton of Aargau)

We use STAT-TAB data as described in the vignette for projecting a single region:

# Time the table-based projection (new feature) for a single region
system.time({
  result_tables <- propop::propop_tables(
    parameters = fso_parameters,
    population = fso_population,
    year_first = 2024,
    year_last = 2055,
    binational = TRUE
  )
})
   user  system elapsed 
 22.303   0.045  22.440 
# Time the matrix-based projection (original) for a single region
system.time({
  result_matrices <- propop::propop(
    parameters = fso_parameters,
    population = fso_population,
    year_first = 2024,
    year_last = 2055,
    binational = TRUE
  )
})
   user  system elapsed 
  5.760   0.040   5.839 

Comparison between FSO-results, propop::propop_tables() and propop::propop()

Performance for five subregions

Show/hide example code
# FSO parameters for fictitious subregions: replicate every parameter row
# five times and label the copies "1" to "5"
fso_parameters_sub <- fso_parameters |>
  tidyr::uncount(5) |>
  dplyr::mutate(
    spatial_unit = as.character(rep(1:5, times = nrow(fso_parameters)))
  )

# Generate "cuts" to distribute the original population across five
# subregions: four reproducible random shares plus one remainder.
# The range of 0.1 to 0.5 avoids extreme values.
draw_cut <- function(seed) {
  # fixed seed so that each cut is reproducible on its own
  set.seed(seed)
  round(runif(1, min = 0.1, max = 0.5), digits = 2)
}

cut_1 <- draw_cut(1)
cut_2 <- draw_cut(2)
cut_3 <- draw_cut(3)
cut_4 <- draw_cut(4)
# the fifth cut is the remainder, so that everything adds up to 100%
cut_5 <- 1 - cut_1 - cut_2 - cut_3 - cut_4

# Share of the original population assigned to each subregion ("cuts")
cut_by_unit <- c(
  "1" = cut_1, "2" = cut_2, "3" = cut_3, "4" = cut_4, "5" = cut_5
)

# Generate population data for five subregions: replicate every row five
# times, label the copies "1" to "5" and scale the head count by the
# subregion's cut (units without a cut would yield NA)
df_population_sub <- fso_population |>
  tidyr::uncount(5) |>
  dplyr::mutate(
    spatial_unit = as.character(rep(1:5, times = nrow(fso_population))),
    # look up the cut by subregion label; unname() keeps `n` a plain vector
    n = round(n * unname(cut_by_unit[spatial_unit]))
  )

# Prepare subregional migration
# Adds a fictitious subregional migration parameter `mig_sub` to the
# subregional parameters: subregions 1 to 4 get a fixed random value each,
# subregion 5 gets the balancing value so that each demographic group
# (nat, sex, age, year, scen) sums to (approximately) zero.
parameters_sub_mig <- fso_parameters_sub |>
  # Create fictitious migration parameters
  dplyr::mutate(
    mig_sub = dplyr::case_when(
      # Four regions with emigration, 1 region with immigration
      # Each branch sets its own seed, so every subregion gets one
      # reproducible draw from a normal distribution (mean 0, sd 0.2),
      # rounded to 4 digits and recycled across all matching rows
      spatial_unit == 1 ~ {
        set.seed(1)
        round(rnorm(1, mean = 0, sd = 0.2), digits = 4)
      },
      spatial_unit == 2 ~ {
        set.seed(2)
        round(rnorm(1, mean = 0, sd = 0.2), digits = 4)
      },
      spatial_unit == 3 ~ {
        set.seed(25)
        round(rnorm(1, mean = 0, sd = 0.2), digits = 4)
      },
      spatial_unit == 4 ~ {
        set.seed(12)
        round(rnorm(1, mean = 0, sd = 0.2), digits = 4)
      },
      # all remaining rows (subregion 5) stay NA until the next step
      TRUE ~ NA
    )
  ) |>
  dplyr::mutate(
    mig_sub = dplyr::case_when(
      # subregion 5 receives the negative sum of the other subregions' values
      # within its group; all other rows keep their value
      spatial_unit == 5 ~ 0 - sum(mig_sub, na.rm = TRUE), TRUE ~ mig_sub
    ),
    # sanity check: group total after balancing (removed by select() below)
    check = sum(mig_sub, na.rm = TRUE),
    .by = c("nat", "sex", "age", "year", "scen")
  ) |>
  # keep the listed parameter columns plus `mig_sub`; this drops `check`
  dplyr::select(
    nat, sex, age, year, scen, spatial_unit, birthrate, int_mothers, mor,
    emi_int, emi_nat, imm_int_n, imm_nat_n, acq, emi_nat_n, mig_nat_n, mig_sub
  )

# Calculate shares
# For every demographic group (nat, sex, age), compute each subregion's share
# of the group's total population across all subregions.
df_population_shares <- df_population_sub |>
  # group total across subregions
  dplyr::mutate(sum_n = sum(n), .by = c(nat, sex, age)) |>
  dplyr::mutate(
    share = n / sum_n,
    # empty groups yield 0 / 0 = NaN; set their share to 0 instead.
    # case_when() is namespace-qualified like every other dplyr call here,
    # so the code also runs when dplyr is not attached.
    share = dplyr::case_when(
      sum_n == 0 ~ 0,
      .default = share
    )
  )

# Apply shares: distribute the immigration numbers across subregions in
# proportion to each subregion's population share
parameters_sub_size <- parameters_sub_mig |>
  dplyr::left_join(
    dplyr::select(
      df_population_shares,
      "spatial_unit", "nat", "sex", "age", "share"
    ),
    by = c("spatial_unit", "nat", "sex", "age")
  ) |>
  dplyr::mutate(
    # number of incoming people per demographic group and spatial unit
    imm_int_n_distr = imm_int_n * share,
    imm_nat_n_distr = imm_nat_n * share,
    # overwrite the original numbers with the distributed ones
    imm_int_n = imm_int_n_distr,
    imm_nat_n = imm_nat_n_distr
  )
# Time the table-based projection for the five subregions
# (subregional migration distributed as net numbers)
system.time({
  result_tables_sub <- propop::propop_tables(
    parameters = parameters_sub_size,
    population = df_population_sub,
    year_first = 2024,
    year_last = 2055,
    subregional = "net",
    binational = TRUE
  )
})
   user  system elapsed 
 55.507   0.084  55.772 
# Time the matrix-based projection for the five subregions
system.time({
  result_matrices_sub <- propop::propop(
    parameters = parameters_sub_size,
    population = df_population_sub,
    year_first = 2024,
    year_last = 2055,
    subregional = TRUE,
    binational = TRUE
  )
})
   user  system elapsed 
 19.900   0.040  20.029 

Next steps

Fine tuning for rates (esp. people aged 80 years and older)
☒ No bugs were found. Results for population components in the output still differ a bit from the original (propop()), but only for components that are based on rates.
☒ Comparison with FSO-results

Subregions
propop_tables() runs for subregions
☒ Two methods enabled for the subregional distribution of migration (net numbers or rates)
☐ Evaluate computation speed and accuracy (tests with real data)

Code cleaning, function-feedback, package tests
☒ Code is more or less clean
☒ Function feedback is there
☐ Package tests (mostly completed, only missing crossover tests between the matrices and tables)
☐ Documentation (e.g. a new vignette)

Integration into the main branch: should propop_tables() remain a separate function, or become an option that is selected via an argument of propop()?